#loading packages
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ stringr 1.4.0
## ✓ tidyr 1.1.2 ✓ forcats 0.5.0
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(ggridges) # for joy plots
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gganimate) # for adding animation layers to ggplots
library(gifski) # for creating the gif (don't need to load this library every time,but need it installed)
#loading data
spotify <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv')
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## track_id = col_character(),
## track_name = col_character(),
## track_artist = col_character(),
## track_album_id = col_character(),
## track_album_name = col_character(),
## track_album_release_date = col_character(),
## playlist_name = col_character(),
## playlist_id = col_character(),
## playlist_genre = col_character(),
## playlist_subgenre = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
# Rap subset: every track whose playlist genre is "rap".
spotify_rap <- filter(spotify, playlist_genre == "rap")
# Popular R&B subset: tracks from "r&b" playlists with a popularity of at
# least 75, with the id columns and the playlist name removed.
randb <- spotify %>%
  filter(playlist_genre == "r&b", track_popularity >= 75) %>%
  select(-c(track_id, track_album_id, playlist_id, playlist_name))
Why did we do an analysis on Spotify? Why is the data significant, and why should people care? Introduce the data to the audience.
Using this dataset, we hope to study the technicalities of music and…
Aside from personal interest…
Data retrieved from GitHub: https://github.com/rfordatascience/tidytuesday/blob/faca0b6bd282998693007c329e3f4b917a5fd7a8/data/2020/2020-01-21/readme.md. Who collected the data and what purpose does it serve? Who funded the data collection? Are there any possible biases? What are the implications of the analysis of this dataset, ethical or otherwise?
# Scatter plot of the mean popularity of popular tracks (popularity >= 75)
# for each release year and playlist genre.
genre_pop <- spotify %>%
  filter(track_popularity >= 75) %>%
  mutate(
    # Plain ymd() failed to parse some release dates, presumably because they
    # hold only a year or year-month. `truncated = 2` allows up to two
    # trailing components to be missing, so those tracks keep a release year
    # instead of getting NA and silently dropping out of the plot.
    ymd_release = ymd(track_album_release_date, truncated = 2),
    year = year(ymd_release)
  ) %>%
  group_by(year, playlist_genre) %>%
  # Drop the grouping explicitly; the plot does not need it.
  summarize(avg_popularity = mean(track_popularity), .groups = "drop") %>%
  ggplot(aes(x = year, y = avg_popularity, color = playlist_genre)) +
  geom_point() +
  labs(
    title = "Average song popularity by genre per year",
    subtitle = "Overall, as music becomes more accessible, average popularity across all genres is on the rise.",
    x = "",
    y = "",
    color = "Genre"
  ) +
  theme_classic()
## Warning: Problem with `mutate()` input `ymd_release`.
## ℹ 68 failed to parse.
## ℹ Input `ymd_release` is `ymd(track_album_release_date)`.
## Warning: 68 failed to parse.
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# Render the genre/year scatter plot as an interactive plotly widget.
plotly::ggplotly(p = genre_pop)
# Box plot of track popularity for each playlist genre, with a blue vertical
# reference line at the overall median popularity.
prelim_graph <- spotify %>%
  ggplot(aes(y = playlist_genre, x = track_popularity)) +
  geom_boxplot() +
  # The median is passed as a constant rather than through aes(): an aes()
  # mapping is evaluated against the data and draws one identical line per
  # row, which is slow and renders as a heavily over-plotted line.
  geom_vline(
    xintercept = median(spotify$track_popularity, na.rm = TRUE),
    color = "blue"
  ) +
  labs(
    title = "Song Popularity by Genre",
    x = "", y = "",
    subtitle = "Song popularity is measured from 0-100, with higher numbers being indicative of more popularity.\nHighest median popularities belong to pop and latin with an overall median popularity of 40",
    caption = "Alex Ismail, Malek Kaloti, Brian Lee"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )
prelim_graph
# Line plot for rap tracks: for each audio characteristic, rounded to one
# decimal place, the percentage of tracks that count as popular
# (popularity > 75).
spotify_rap %>%
  mutate(
    Rounded_Danceability = round(danceability, digits = 1),
    Rounded_Energy = round(energy, digits = 1),
    Rounded_Speechiness = round(speechiness, digits = 1),
    Rounded_Instrumental = round(instrumentalness, digits = 1),
    popular = track_popularity > 75
  ) %>%
  # One row per track and characteristic.
  pivot_longer(
    cols = starts_with("Rounded"),
    names_to = "Stat1",
    values_to = "Rounded_Value"
  ) %>%
  group_by(Stat1, Rounded_Value) %>%
  # Drop the grouping here: if the result stays grouped by Stat1, the
  # fct_recode() below runs once per group, sees a single level each time and
  # warns about the other three being unknown.
  summarize(Pop_Rate = mean(popular) * 100, .groups = "drop") %>%
  mutate(
    Stat = fct_recode(
      Stat1,
      Danceability = "Rounded_Danceability",
      Energy = "Rounded_Energy",
      Speechiness = "Rounded_Speechiness",
      Instrumental = "Rounded_Instrumental"
    )
  ) %>%
  ggplot(aes(x = Rounded_Value, y = Pop_Rate)) +
  geom_line(aes(color = Stat)) +
  labs(
    title = "Popularity of Rap Songs by Song Characteristic",
    x = "", y = "Percent Popular", color = "Song Statistic"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )
## `summarise()` regrouping output by 'Stat1' (override with `.groups` argument)
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Energy, Rounded_Speechiness, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 1: Stat1 = "Rounded_Danceability".
## Warning: Unknown levels in `f`: Rounded_Energy, Rounded_Speechiness,
## Rounded_Instrumental
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Speechiness, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 2: Stat1 = "Rounded_Energy".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Speechiness,
## Rounded_Instrumental
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Energy, Rounded_Speechiness
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 3: Stat1 = "Rounded_Instrumental".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Energy,
## Rounded_Speechiness
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Energy, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 4: Stat1 = "Rounded_Speechiness".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Energy,
## Rounded_Instrumental
# Bar chart, one panel per genre: percentage of popular tracks
# (popularity > 75) among tracks whose name suggests several artists
# (a remix or a feature) versus all other tracks.
spotify %>%
  mutate(
    track_name_lower = str_to_lower(track_name),
    # The name is lower-cased above, so the patterns must be lower case too;
    # a capitalised "Remix" can never match.
    # NOTE(review): "feat" is a plain substring match and would also hit
    # words such as "defeat" -- confirm this is acceptable.
    remix = str_detect(track_name_lower, "remix"),
    feature = str_detect(track_name_lower, "feat"),
    # A missing track name gives NA; count those tracks as single-artist.
    has_collab = replace_na(remix | feature, FALSE),
    multiple_artists = if_else(
      has_collab,
      true = "Multiple Artists",
      false = "One Artist"
    ),
    popular = track_popularity > 75
  ) %>%
  group_by(multiple_artists, playlist_genre) %>%
  # Drop the grouping so the factor below is built once over all rows.
  summarize(prop_pop = mean(popular) * 100, .groups = "drop") %>%
  # Put rap first in the facet order.
  mutate(genre = fct_relevel(playlist_genre, "rap")) %>%
  ggplot() +
  geom_col(aes(x = multiple_artists, y = prop_pop)) +
  facet_wrap(~genre) +
  labs(
    title = "Popularity of Songs Containing Multiple Artists Across Genre",
    x = "", y = "Percent of Songs Popular"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )
## `summarise()` regrouping output by 'multiple_artists' (override with `.groups` argument)
In this section, I want to take a closer look at one of my favorite genres of music, R&B. I think I love it so much because it’s often good music to unwind to – it’s smooth, slow, and relaxing. I also love its versatility! R&B can fit the mood of anything from a gloomy, rainy day to a bright, sunny day. But why? What characteristics make R&B such a great genre to listen to? Using the Spotify dataset and some visualizations which look at the specific characteristics of the most popular R&B songs (songs with a popularity rating of 75 or above), I hope to come closer to answering these questions.
# The twelve highest-popularity rows of the popular R&B subset. The rows are
# grouped by track name first; the sort itself is over the whole table
# (arrange() ignores groups unless `.by_group = TRUE`).
head(
  arrange(
    group_by(randb, track_name),
    desc(track_popularity)
  ),
  n = 12
)
## # A tibble: 12 x 19
## # Groups: track_name [8]
## track_name track_artist track_popularity track_album_name track_album_rel…
## <chr> <chr> <dbl> <chr> <chr>
## 1 ROXANNE Arizona Zer… 99 ROXANNE 2019-10-10
## 2 ROXANNE Arizona Zer… 99 ROXANNE 2019-10-10
## 3 The Box Roddy Ricch 98 Please Excuse M… 2019-12-06
## 4 Memories Maroon 5 98 Memories 2019-09-20
## 5 Blinding … The Weeknd 98 Blinding Lights 2019-11-29
## 6 Blinding … The Weeknd 98 Blinding Lights 2019-11-29
## 7 The Box Roddy Ricch 98 Please Excuse M… 2019-12-06
## 8 Tusa KAROL G 98 Tusa 2019-11-07
## 9 Memories Maroon 5 98 Memories 2019-09-20
## 10 Circles Post Malone 98 Hollywood's Ble… 2019-09-06
## 11 Don't Sta… Dua Lipa 97 Don't Start Now 2019-10-31
## 12 everythin… Billie Eili… 97 everything i wa… 2019-11-13
## # … with 14 more variables: playlist_genre <chr>, playlist_subgenre <chr>,
## # danceability <dbl>, energy <dbl>, key <dbl>, loudness <dbl>, mode <dbl>,
## # speechiness <dbl>, acousticness <dbl>, instrumentalness <dbl>,
## # liveness <dbl>, valence <dbl>, tempo <dbl>, duration_ms <dbl>
Above are the 12 highest-popularity rows in the R&B subset; because some songs appear more than once, they cover 8 distinct songs. We can see that all of them were released in 2019 and that all are categorized under my two favorite subgenres of R&B, Urban Contemporary and Hip Pop. All of them also boast a danceability score above 0.5, and most of them (with the exception of Maroon 5’s Memories and Billie Eilish’s everything i wanted) have energy scores above 0.5. We can also see that, across the board, these songs have low speechiness and instrumentalness scores (with the exception of Billie Eilish’s everything i wanted). Interestingly, all of the songs fall within a valence of 0.2-0.6. The other characteristics are quite varied. So, for the purposes of my analysis of the R&B genre, I will only focus on the song characteristics that have clear trends across the genre – danceability, energy, speechiness, instrumentalness, and valence.
In the exploratory phase of my analysis of the R&B genre, the most obvious characteristic of an R&B song was its subgenre. Are certain subgenres more likely to have more popular songs because some have more fans and listeners than others? In the density plot below, we see that this appears to be the case – Neo-Soul and New Jack Swing have the highest quantity of popular songs.
# Animated density plot of track popularity in the popular R&B subset,
# stepping through the subgenres one state at a time. `{closest_state}` in
# the subtitle is filled in by gganimate with the current subgenre.
ggplot(randb, aes(x = track_popularity, fill = playlist_subgenre)) +
  geom_density(alpha = 0.1) +
  labs(
    title = "Do certain subgenres have more popular songs?",
    subtitle = "This density plot only includes songs with a popularity of >=75.\nIt seems that Neo-Soul and New Jack Swing have the most popular songs.\n\nR&B Subgenre: {closest_state}",
    x = "Track Popularity",
    y = "",
    fill = "R&B subgenre",
    caption = "Visualization created by Brian Lee"
  ) +
  theme_classic() +
  transition_states(
    states = playlist_subgenre,
    transition_length = 3,
    state_length = 1
  )
# TODO: get rid of axes, make subtitle descriptive
# Write the animation to a gif in the working directory. No animation object
# is passed, so anim_save() uses its default (per the gganimate docs, the
# most recently rendered animation).
anim_save("randb_density.gif")
# Embed the saved gif in the knitted document.
knitr::include_graphics("randb_density.gif")
In the density plot above, Neo-Soul and New Jack Swing both seem to have a lot of popular songs on the lower end of the spectrum (75-85), with Urban Contemporary and Hip Pop following similar trends, but in comparison to the other two subgenres, their density curves are not as large, signaling that the former two subgenres have more songs classified as “popular” than the latter two.
I believe that this trend could be occurring because of the huge increase in the production of hip pop and urban contemporary music. With streaming services such as Spotify making it easier than ever for small creators to attain platforms, and with the advancement of technology making it easier to produce and release music from one’s own bedroom, the music industry may be oversaturated – there are more songs being released than ever.
# Markdown table with the number of songs and the mean popularity for each
# R&B subgenre in the popular subset.
knitr::kable(
  summarize(
    group_by(randb, playlist_subgenre),
    num_of_songs = n(),
    avg_pop = mean(track_popularity)
  )
)
## `summarise()` ungrouping output (override with `.groups` argument)
| playlist_subgenre | num_of_songs | avg_pop |
|---|---|---|
| hip pop | 281 | 82.95018 |
| neo soul | 41 | 78.90244 |
| new jack swing | 4 | 77.50000 |
| urban contemporary | 204 | 82.13725 |
Despite the large density curves, on average, Hip Pop and Urban Contemporary are slightly more popular than Neo-Soul and New Jack Swing. Another interesting observation we can make is the sheer lack of popular songs for Neo-Soul and New Jack Swing.
A quick Google search will reveal that both Neo-Soul and New Jack Swing were subgenres of R&B that were popular during the 1980s/90s. Their large density curves could be due to this fact. Because the technology for household high-quality handheld microphones and producing equipment was not in abundance like it is now, artists had to rely on label companies and managers for the funding to acquire the money for studios and expensive equipment, thus leading to less music being produced. Additionally, because labeling agencies and managerial agencies essentially “invested” in discovered artists whom they knew they would get a high profit margin from, the discovered artists who were given a platform by these agencies were more likely to be successful. With a smaller pool of music and more popular songs making up that small pool of music, large density curves such as the ones we see in the visualization above for Neo-Soul and New Jack Swing are possible, and could serve as an explanation for the difference in the quantity between the four subgenres.
As I move forward in my analysis to look at the specific characteristics of popular R&B songs, I will restrict myself to the two subgenres with more cases to look at, which are also my two personal favorite subgenres – Hip Pop and Urban Contemporary.
# Markdown table of mean popularity and mean audio characteristics for the
# two subgenres studied in detail, hip pop and urban contemporary.
randb %>%
  # %in% tests set membership row by row. `==` against a length-2 vector
  # recycles that vector along the column, so rows are kept or dropped by
  # position and part of each subgenre is silently lost.
  filter(playlist_subgenre %in% c("hip pop", "urban contemporary")) %>%
  group_by(playlist_subgenre) %>%
  summarise(
    across(
      c(
        track_popularity, danceability, energy,
        speechiness, instrumentalness, valence
      ),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  knitr::kable()
## Warning in playlist_subgenre == c("hip pop", "urban contemporary"): longer
## object length is not a multiple of shorter object length
## Warning in playlist_subgenre == c("hip pop", "urban contemporary"): longer
## object length is not a multiple of shorter object length
| playlist_subgenre | track_popularity | danceability | energy | speechiness | instrumentalness | valence |
|---|---|---|---|---|---|---|
| hip pop | 82.62411 | 0.6985887 | 0.6000979 | 0.1304929 | 0.0120022 | 0.4780922 |
| urban contemporary | 81.98039 | 0.6823333 | 0.5401578 | 0.1340971 | 0.0135849 | 0.4606735 |
# Add graph
As it becomes easier to produce and release music from one’s own bedroom, and as streaming platforms such as Apple Music and Spotify increasingly make music accessible to everyone, we believe our analysis has important implications which can help listeners find new songs that they like and help platforms build algorithms that give better and more relevant song recommendations to their users.
Of course, correlation does not equal causation. Just because the…
Thanks to streaming platforms such as Spotify and Apple Music, small creators are also given a platform for creative release. Our analyses of pop, rap, and R&B can also help small artists grow their own platforms to cater to the interests of specific audiences. In a time such as now, when the consumption of art (whether it be in the form of movies, music, or television) is essential to one’s mental wellbeing, our analysis can help boost these efforts. By asking the question, “What makes a song in a given genre popular?”, we have taken a close look at the specific characteristics of songs with a popularity rating of 75 or higher.